Automate emotion analysis of textual comments and feedback¶
Importing necessary packages¶
In [1]:
!pip install nlp
!pip install datasets
import tensorflow as tf
import numpy as np
import pandas as pd
from wordcloud import WordCloud
import seaborn as sns
%matplotlib inline
import matplotlib.pyplot as plt
import plotly.express as px
import plotly.graph_objects as go
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
import nlp
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from keras.layers import Dense, Dropout
from keras.layers import LSTM
from keras.models import Sequential
from keras.layers import Embedding
from keras.layers import Flatten
from keras.layers import Bidirectional
from keras.callbacks import EarlyStopping
from keras.layers import GlobalAvgPool1D
import random
Requirement already satisfied: nlp in c:\users\mayank\appdata\local\programs\python\python312\lib\site-packages (0.4.0) Requirement already satisfied: numpy in c:\users\mayank\appdata\local\programs\python\python312\lib\site-packages (from nlp) (1.26.2) Requirement already satisfied: pyarrow>=0.16.0 in c:\users\mayank\appdata\local\programs\python\python312\lib\site-packages (from nlp) (16.0.0) Requirement already satisfied: dill in c:\users\mayank\appdata\local\programs\python\python312\lib\site-packages (from nlp) (0.3.8) Requirement already satisfied: pandas in c:\users\mayank\appdata\local\programs\python\python312\lib\site-packages (from nlp) (2.2.2) Requirement already satisfied: requests>=2.19.0 in c:\users\mayank\appdata\local\programs\python\python312\lib\site-packages (from nlp) (2.31.0) Requirement already satisfied: tqdm>=4.27 in c:\users\mayank\appdata\local\programs\python\python312\lib\site-packages (from nlp) (4.66.2) Requirement already satisfied: filelock in c:\users\mayank\appdata\local\programs\python\python312\lib\site-packages (from nlp) (3.14.0) Requirement already satisfied: xxhash in c:\users\mayank\appdata\local\programs\python\python312\lib\site-packages (from nlp) (3.4.1) Requirement already satisfied: charset-normalizer<4,>=2 in c:\users\mayank\appdata\local\programs\python\python312\lib\site-packages (from requests>=2.19.0->nlp) (3.3.2) Requirement already satisfied: idna<4,>=2.5 in c:\users\mayank\appdata\local\programs\python\python312\lib\site-packages (from requests>=2.19.0->nlp) (3.6) Requirement already satisfied: urllib3<3,>=1.21.1 in c:\users\mayank\appdata\local\programs\python\python312\lib\site-packages (from requests>=2.19.0->nlp) (2.1.0) Requirement already satisfied: certifi>=2017.4.17 in c:\users\mayank\appdata\local\programs\python\python312\lib\site-packages (from requests>=2.19.0->nlp) (2023.11.17) Requirement already satisfied: colorama in c:\users\mayank\appdata\roaming\python\python312\site-packages (from tqdm>=4.27->nlp) (0.4.6) Requirement already satisfied: python-dateutil>=2.8.2 in c:\users\mayank\appdata\roaming\python\python312\site-packages (from pandas->nlp) (2.8.2) Requirement already satisfied: pytz>=2020.1 in c:\users\mayank\appdata\local\programs\python\python312\lib\site-packages (from pandas->nlp) (2024.1) Requirement already satisfied: tzdata>=2022.7 in c:\users\mayank\appdata\local\programs\python\python312\lib\site-packages (from pandas->nlp) (2024.1) Requirement already satisfied: six>=1.5 in c:\users\mayank\appdata\roaming\python\python312\site-packages (from python-dateutil>=2.8.2->pandas->nlp) (1.16.0) Requirement already satisfied: datasets in c:\users\mayank\appdata\local\programs\python\python312\lib\site-packages (2.19.0) Requirement already satisfied: filelock in c:\users\mayank\appdata\local\programs\python\python312\lib\site-packages (from datasets) (3.14.0) Requirement already satisfied: numpy>=1.17 in c:\users\mayank\appdata\local\programs\python\python312\lib\site-packages (from datasets) (1.26.2) Requirement already satisfied: pyarrow>=12.0.0 in c:\users\mayank\appdata\local\programs\python\python312\lib\site-packages (from datasets) (16.0.0) Requirement already satisfied: pyarrow-hotfix in c:\users\mayank\appdata\local\programs\python\python312\lib\site-packages (from datasets) (0.6) Requirement already satisfied: dill<0.3.9,>=0.3.0 in c:\users\mayank\appdata\local\programs\python\python312\lib\site-packages (from datasets) (0.3.8) Requirement already satisfied: pandas in c:\users\mayank\appdata\local\programs\python\python312\lib\site-packages (from datasets) (2.2.2) Requirement already satisfied: requests>=2.19.0 in c:\users\mayank\appdata\local\programs\python\python312\lib\site-packages (from datasets) (2.31.0) Requirement already satisfied: tqdm>=4.62.1 in c:\users\mayank\appdata\local\programs\python\python312\lib\site-packages (from datasets) (4.66.2) Requirement already satisfied: xxhash in c:\users\mayank\appdata\local\programs\python\python312\lib\site-packages (from datasets) (3.4.1) Requirement already satisfied: multiprocess in c:\users\mayank\appdata\local\programs\python\python312\lib\site-packages (from datasets) (0.70.16) Requirement already satisfied: fsspec<=2024.3.1,>=2023.1.0 in c:\users\mayank\appdata\local\programs\python\python312\lib\site-packages (from fsspec[http]<=2024.3.1,>=2023.1.0->datasets) (2024.3.1) Requirement already satisfied: aiohttp in c:\users\mayank\appdata\local\programs\python\python312\lib\site-packages (from datasets) (3.9.5) Requirement already satisfied: huggingface-hub>=0.21.2 in c:\users\mayank\appdata\local\programs\python\python312\lib\site-packages (from datasets) (0.23.0) Requirement already satisfied: packaging in c:\users\mayank\appdata\roaming\python\python312\site-packages (from datasets) (23.2) Requirement already satisfied: pyyaml>=5.1 in c:\users\mayank\appdata\local\programs\python\python312\lib\site-packages (from datasets) (6.0.1) Requirement already satisfied: aiosignal>=1.1.2 in c:\users\mayank\appdata\local\programs\python\python312\lib\site-packages (from aiohttp->datasets) (1.3.1) Requirement already satisfied: attrs>=17.3.0 in c:\users\mayank\appdata\local\programs\python\python312\lib\site-packages (from aiohttp->datasets) (23.1.0) Requirement already satisfied: frozenlist>=1.1.1 in c:\users\mayank\appdata\local\programs\python\python312\lib\site-packages (from aiohttp->datasets) (1.4.1) Requirement already satisfied: multidict<7.0,>=4.5 in c:\users\mayank\appdata\local\programs\python\python312\lib\site-packages (from aiohttp->datasets) (6.0.5) Requirement already satisfied: yarl<2.0,>=1.0 in c:\users\mayank\appdata\local\programs\python\python312\lib\site-packages (from aiohttp->datasets) (1.9.4) Requirement already satisfied: typing-extensions>=3.7.4.3 in c:\users\mayank\appdata\local\programs\python\python312\lib\site-packages (from huggingface-hub>=0.21.2->datasets) (4.11.0) Requirement already satisfied: charset-normalizer<4,>=2 in c:\users\mayank\appdata\local\programs\python\python312\lib\site-packages (from requests>=2.19.0->datasets) (3.3.2) Requirement already satisfied: idna<4,>=2.5 in c:\users\mayank\appdata\local\programs\python\python312\lib\site-packages (from requests>=2.19.0->datasets) (3.6) Requirement already satisfied: urllib3<3,>=1.21.1 in c:\users\mayank\appdata\local\programs\python\python312\lib\site-packages (from requests>=2.19.0->datasets) (2.1.0) Requirement already satisfied: certifi>=2017.4.17 in c:\users\mayank\appdata\local\programs\python\python312\lib\site-packages (from requests>=2.19.0->datasets) (2023.11.17) Requirement already satisfied: colorama in c:\users\mayank\appdata\roaming\python\python312\site-packages (from tqdm>=4.62.1->datasets) (0.4.6) Requirement already satisfied: python-dateutil>=2.8.2 in c:\users\mayank\appdata\roaming\python\python312\site-packages (from pandas->datasets) (2.8.2) Requirement already satisfied: pytz>=2020.1 in c:\users\mayank\appdata\local\programs\python\python312\lib\site-packages (from pandas->datasets) (2024.1) Requirement already satisfied: tzdata>=2022.7 in c:\users\mayank\appdata\local\programs\python\python312\lib\site-packages (from pandas->datasets) (2024.1) Requirement already satisfied: six>=1.5 in c:\users\mayank\appdata\roaming\python\python312\site-packages (from python-dateutil>=2.8.2->pandas->datasets) (1.16.0)
The datasets library from Hugging Face provides easy access to a wide range of datasets that are commonly used for NLP tasks.¶
It offers functionalities to download, preprocess, and work with datasets seamlessly, allowing users to load datasets easily into their machine learning or deep learning pipelines.¶
In [2]:
pip install dataset
Requirement already satisfied: dataset in c:\users\mayank\appdata\local\programs\python\python312\lib\site-packages (1.6.2) Requirement already satisfied: sqlalchemy<2.0.0,>=1.3.2 in c:\users\mayank\appdata\local\programs\python\python312\lib\site-packages (from dataset) (1.4.52) Requirement already satisfied: alembic>=0.6.2 in c:\users\mayank\appdata\local\programs\python\python312\lib\site-packages (from dataset) (1.13.1) Requirement already satisfied: banal>=1.0.1 in c:\users\mayank\appdata\local\programs\python\python312\lib\site-packages (from dataset) (1.0.6) Requirement already satisfied: Mako in c:\users\mayank\appdata\local\programs\python\python312\lib\site-packages (from alembic>=0.6.2->dataset) (1.3.3) Requirement already satisfied: typing-extensions>=4 in c:\users\mayank\appdata\local\programs\python\python312\lib\site-packages (from alembic>=0.6.2->dataset) (4.11.0) Requirement already satisfied: greenlet!=0.4.17 in c:\users\mayank\appdata\local\programs\python\python312\lib\site-packages (from sqlalchemy<2.0.0,>=1.3.2->dataset) (3.0.3) Requirement already satisfied: MarkupSafe>=0.9.2 in c:\users\mayank\appdata\local\programs\python\python312\lib\site-packages (from Mako->alembic>=0.6.2->dataset) (2.1.3) Note: you may need to restart the kernel to use updated packages.
Importing the Dataset¶
In [3]:
from datasets import load_dataset
data = load_dataset('emotion')
C:\Users\Mayank\AppData\Local\Programs\Python\Python312\Lib\site-packages\datasets\load.py:1486: FutureWarning: The repository for emotion contains custom code which must be executed to correctly load the dataset. You can inspect the repository content at https://hf.co/datasets/emotion You can avoid this message in future by passing the argument `trust_remote_code=True`. Passing `trust_remote_code=True` will be mandatory to load this dataset from the next major release of `datasets`. warnings.warn(
In [4]:
# Converting the train, validation and test datasets into DataFrame format
train = pd.DataFrame(data['train'])
validation = pd.DataFrame(data['validation'])
test = pd.DataFrame(data['test'])
In [5]:
train.head(10)
Out[5]:
| text | label | |
|---|---|---|
| 0 | i didnt feel humiliated | 0 |
| 1 | i can go from feeling so hopeless to so damned... | 0 |
| 2 | im grabbing a minute to post i feel greedy wrong | 3 |
| 3 | i am ever feeling nostalgic about the fireplac... | 2 |
| 4 | i am feeling grouchy | 3 |
| 5 | ive been feeling a little burdened lately wasn... | 0 |
| 6 | ive been taking or milligrams or times recomme... | 5 |
| 7 | i feel as confused about life as a teenager or... | 4 |
| 8 | i have been with petronas for years i feel tha... | 1 |
| 9 | i feel romantic too | 2 |
In [6]:
train['label'].unique()
Out[6]:
array([0, 3, 2, 5, 4, 1], dtype=int64)
Distribution of the Length of the Texts¶
In [7]:
train['length_of_text'] = [len(i.split(' ')) for i in train['text']]
fig = px.histogram(train['length_of_text'], marginal='box',
labels={"value": "Length of the Text"})
fig.update_traces(marker=dict(line=dict(color='#000000', width=2)))
fig.update_layout(title_text='Distribution of the Length of the Texts',
title_x=0.5, title_font=dict(size=22))
fig.show()
Distribution of the Length of the Texts by Emotions¶
In [8]:
fig = px.histogram(train['length_of_text'], marginal='box',
labels={"value": "Length of the Text"},
color=train['label'])
fig.update_traces(marker=dict(line=dict(color='#000000', width=2)))
fig.update_layout(title_text='Distribution of the Length of the Texts by Emotions',
title_x=0.5, title_font=dict(size=22))
fig.show()
Distribution of the Labels¶
In [9]:
fig = px.histogram(train, x='label', color='label')
fig.update_traces(marker=dict(line=dict(color='#000000', width=2)))
fig.update_layout(title_text='Distribution of the Labels',
title_x=0.5, title_font=dict(size=22))
fig.show()
In [10]:
FreqOfWords = train['text'].str.split(expand=True).stack().value_counts()
FreqOfWords_top200 = FreqOfWords[:200]
top_words_df = pd.DataFrame({'word': FreqOfWords_top200.index, 'frequency': FreqOfWords_top200.values})
import plotly.express as px
fig = px.treemap(top_words_df, path=['word'], values='frequency')
fig.update_layout(title_text='Frequency of the Words in the Train Dataset', title_x=0.5, title_font=dict(size=22))
fig.update_traces(textinfo="label+value")
fig.show()
Tokenizing with NLTK¶
In [11]:
import nltk
nltk.download('punkt')
[nltk_data] Downloading package punkt to [nltk_data] C:\Users\Mayank\AppData\Roaming\nltk_data... [nltk_data] Package punkt is already up-to-date!
Out[11]:
True
In [12]:
def tokenization(inputs):
return word_tokenize(inputs) #REFERENCE[1]
train['text_tokenized'] = train['text'].apply(tokenization)
validation['text_tokenized'] = validation['text'].apply(tokenization)
In [13]:
train.head()
Out[13]:
| text | label | length_of_text | text_tokenized | |
|---|---|---|---|---|
| 0 | i didnt feel humiliated | 0 | 4 | [i, didnt, feel, humiliated] |
| 1 | i can go from feeling so hopeless to so damned... | 0 | 21 | [i, can, go, from, feeling, so, hopeless, to, ... |
| 2 | im grabbing a minute to post i feel greedy wrong | 3 | 10 | [im, grabbing, a, minute, to, post, i, feel, g... |
| 3 | i am ever feeling nostalgic about the fireplac... | 2 | 18 | [i, am, ever, feeling, nostalgic, about, the, ... |
| 4 | i am feeling grouchy | 3 | 4 | [i, am, feeling, grouchy] |
Stopwords Removal¶
In [14]:
import nltk
nltk.download('stopwords')
[nltk_data] Downloading package stopwords to [nltk_data] C:\Users\Mayank\AppData\Roaming\nltk_data... [nltk_data] Package stopwords is already up-to-date!
Out[14]:
True
In [15]:
stop_words = set(stopwords.words('english'))
def stopwords_remove(inputs):
return [item for item in inputs if item not in stop_words]
train['text_stop'] = train['text_tokenized'].apply(stopwords_remove)
validation['text_stop'] = validation['text_tokenized'].apply(stopwords_remove)
train.head()
Out[15]:
| text | label | length_of_text | text_tokenized | text_stop | |
|---|---|---|---|---|---|
| 0 | i didnt feel humiliated | 0 | 4 | [i, didnt, feel, humiliated] | [didnt, feel, humiliated] |
| 1 | i can go from feeling so hopeless to so damned... | 0 | 21 | [i, can, go, from, feeling, so, hopeless, to, ... | [go, feeling, hopeless, damned, hopeful, aroun... |
| 2 | im grabbing a minute to post i feel greedy wrong | 3 | 10 | [im, grabbing, a, minute, to, post, i, feel, g... | [im, grabbing, minute, post, feel, greedy, wrong] |
| 3 | i am ever feeling nostalgic about the fireplac... | 2 | 18 | [i, am, ever, feeling, nostalgic, about, the, ... | [ever, feeling, nostalgic, fireplace, know, st... |
| 4 | i am feeling grouchy | 3 | 4 | [i, am, feeling, grouchy] | [feeling, grouchy] |
Lemmatization¶
In [16]:
import nltk
nltk.download('wordnet')
[nltk_data] Downloading package wordnet to [nltk_data] C:\Users\Mayank\AppData\Roaming\nltk_data... [nltk_data] Package wordnet is already up-to-date!
Out[16]:
True
In [17]:
lemmatizer = WordNetLemmatizer()
def lemmatization(inputs):
return [lemmatizer.lemmatize(word=x, pos='v') for x in inputs]
train['text_lemmatized'] = train['text_stop'].apply(lemmatization)
validation['text_lemmatized'] = validation['text_stop'].apply(lemmatization)
train.head()
Out[17]:
| text | label | length_of_text | text_tokenized | text_stop | text_lemmatized | |
|---|---|---|---|---|---|---|
| 0 | i didnt feel humiliated | 0 | 4 | [i, didnt, feel, humiliated] | [didnt, feel, humiliated] | [didnt, feel, humiliate] |
| 1 | i can go from feeling so hopeless to so damned... | 0 | 21 | [i, can, go, from, feeling, so, hopeless, to, ... | [go, feeling, hopeless, damned, hopeful, aroun... | [go, feel, hopeless, damn, hopeful, around, so... |
| 2 | im grabbing a minute to post i feel greedy wrong | 3 | 10 | [im, grabbing, a, minute, to, post, i, feel, g... | [im, grabbing, minute, post, feel, greedy, wrong] | [im, grab, minute, post, feel, greedy, wrong] |
| 3 | i am ever feeling nostalgic about the fireplac... | 2 | 18 | [i, am, ever, feeling, nostalgic, about, the, ... | [ever, feeling, nostalgic, fireplace, know, st... | [ever, feel, nostalgic, fireplace, know, still... |
| 4 | i am feeling grouchy | 3 | 4 | [i, am, feeling, grouchy] | [feeling, grouchy] | [feel, grouchy] |
Joining Tokens into Sentences¶
In [18]:
train['text_cleaned'] = train['text_lemmatized'].str.join(' ')
validation['text_cleaned'] = validation['text_lemmatized'].str.join(' ')
train.head() # Final form of the dataset
Out[18]:
| text | label | length_of_text | text_tokenized | text_stop | text_lemmatized | text_cleaned | |
|---|---|---|---|---|---|---|---|
| 0 | i didnt feel humiliated | 0 | 4 | [i, didnt, feel, humiliated] | [didnt, feel, humiliated] | [didnt, feel, humiliate] | didnt feel humiliate |
| 1 | i can go from feeling so hopeless to so damned... | 0 | 21 | [i, can, go, from, feeling, so, hopeless, to, ... | [go, feeling, hopeless, damned, hopeful, aroun... | [go, feel, hopeless, damn, hopeful, around, so... | go feel hopeless damn hopeful around someone c... |
| 2 | im grabbing a minute to post i feel greedy wrong | 3 | 10 | [im, grabbing, a, minute, to, post, i, feel, g... | [im, grabbing, minute, post, feel, greedy, wrong] | [im, grab, minute, post, feel, greedy, wrong] | im grab minute post feel greedy wrong |
| 3 | i am ever feeling nostalgic about the fireplac... | 2 | 18 | [i, am, ever, feeling, nostalgic, about, the, ... | [ever, feeling, nostalgic, fireplace, know, st... | [ever, feel, nostalgic, fireplace, know, still... | ever feel nostalgic fireplace know still property |
| 4 | i am feeling grouchy | 3 | 4 | [i, am, feeling, grouchy] | [feeling, grouchy] | [feel, grouchy] | feel grouchy |
WordCloud of the Cleaned Dataset¶
In [19]:
WordCloud = WordCloud(max_words=100,
random_state=30,
collocations=True).generate(str((train['text_cleaned'])))
plt.figure(figsize=(15, 8))
plt.imshow(WordCloud, interpolation='bilinear')
plt.axis("off")
plt.show()
¶
Tokenizing with Tensorflow
In [20]:
num_words = 10000
tokenizer = Tokenizer(num_words=num_words, oov_token='<OOV>')
tokenizer.fit_on_texts(train['text_cleaned'])
word_index = tokenizer.word_index
In [21]:
num_words = 10000
tokenizer = Tokenizer(num_words=num_words, oov_token='<OOV>')
tokenizer.fit_on_texts(train['text_cleaned'])
word_index = tokenizer.word_index
In [22]:
Tokenized_train = tokenizer.texts_to_sequences(train['text_cleaned'])
Tokenized_val = tokenizer.texts_to_sequences(validation['text_cleaned'])
In [23]:
print('Non-tokenized Version: ', train['text_cleaned'][0])
print('Tokenized Version: ', tokenizer.texts_to_sequences([train['text_cleaned'][0]]))
print('--'*50)
print('Non-tokenized Version: ', train['text_cleaned'][10])
print('Tokenized Version: ', tokenizer.texts_to_sequences([train['text_cleaned'][10]]))
print('--'*50)
print('Non-tokenized Version: ', train['text'][100])
print('Tokenized Version: ', tokenizer.texts_to_sequences([train['text_cleaned'][100]]))
Non-tokenized Version: didnt feel humiliate Tokenized Version: [[56, 2, 559]] ---------------------------------------------------------------------------------------------------- Non-tokenized Version: feel like make suffer see mean something Tokenized Version: [[2, 3, 6, 393, 31, 102, 25]] ---------------------------------------------------------------------------------------------------- Non-tokenized Version: i wont let me child cry it out because i feel that loving her and lily when she was little was going to be opportunities that only lasted for those short few months Tokenized Version: [[356, 82, 289, 230, 2, 14, 3422, 13, 7, 2310, 87, 742, 263]]
Padding¶
In [24]:
maxlen = 40
Padded_train = pad_sequences(Tokenized_train, maxlen=maxlen, padding='pre')
Padded_val = pad_sequences(Tokenized_val, maxlen=maxlen, padding='pre')
print('Non-padded Version: ', tokenizer.texts_to_sequences([train['text_cleaned'][0]]))
print('Padded Version: ', Padded_train[0])
print('--'*50)
print('Non-padded Version: ', tokenizer.texts_to_sequences([train['text_cleaned'][10]]))
print('Padded Version: ', Padded_train[10])
Non-padded Version: [[56, 2, 559]] Padded Version: [ 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 56 2 559] ---------------------------------------------------------------------------------------------------- Non-padded Version: [[2, 3, 6, 393, 31, 102, 25]] Padded Version: [ 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 2 3 6 393 31 102 25]
Creating the Model¶
In [26]:
model = Sequential()
model.add(Embedding(num_words, 16))
model.add(GlobalAvgPool1D())
tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(50, return_sequences=True, activation='relu'))
model.add(Dropout(0.3))
tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(40, activation='relu', return_sequences=True))
model.add(Dropout(0.3))
tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(40, activation='relu'))
model.add(Dropout(0.3))
model.add(Dense(6, activation='softmax'))
model.compile(loss='sparse_categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
model.summary()
Model: "sequential_1"
┏━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━┓ ┃ Layer (type) ┃ Output Shape ┃ Param # ┃ ┡━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━┩ │ embedding_1 (Embedding) │ ? │ 0 (unbuilt) │ ├──────────────────────────────────────┼─────────────────────────────┼─────────────────┤ │ global_average_pooling1d_1 │ ? │ 0 (unbuilt) │ │ (GlobalAveragePooling1D) │ │ │ ├──────────────────────────────────────┼─────────────────────────────┼─────────────────┤ │ dropout_3 (Dropout) │ ? │ 0 (unbuilt) │ ├──────────────────────────────────────┼─────────────────────────────┼─────────────────┤ │ dropout_4 (Dropout) │ ? │ 0 (unbuilt) │ ├──────────────────────────────────────┼─────────────────────────────┼─────────────────┤ │ dropout_5 (Dropout) │ ? │ 0 (unbuilt) │ ├──────────────────────────────────────┼─────────────────────────────┼─────────────────┤ │ dense_1 (Dense) │ ? │ 0 (unbuilt) │ └──────────────────────────────────────┴─────────────────────────────┴─────────────────┘
Total params: 0 (0.00 B)
Trainable params: 0 (0.00 B)
Non-trainable params: 0 (0.00 B)
Training the Model¶
In [27]:
label_ = {"sadness": 0, "joy": 1, "love": 2, "anger": 3, "fear": 4, "surprise": 5}
train['label'] = train['label'].replace(label_)
validation['label'] = validation['label'].replace(label_)
train.head()
Out[27]:
| text | label | length_of_text | text_tokenized | text_stop | text_lemmatized | text_cleaned | |
|---|---|---|---|---|---|---|---|
| 0 | i didnt feel humiliated | 0 | 4 | [i, didnt, feel, humiliated] | [didnt, feel, humiliated] | [didnt, feel, humiliate] | didnt feel humiliate |
| 1 | i can go from feeling so hopeless to so damned... | 0 | 21 | [i, can, go, from, feeling, so, hopeless, to, ... | [go, feeling, hopeless, damned, hopeful, aroun... | [go, feel, hopeless, damn, hopeful, around, so... | go feel hopeless damn hopeful around someone c... |
| 2 | im grabbing a minute to post i feel greedy wrong | 3 | 10 | [im, grabbing, a, minute, to, post, i, feel, g... | [im, grabbing, minute, post, feel, greedy, wrong] | [im, grab, minute, post, feel, greedy, wrong] | im grab minute post feel greedy wrong |
| 3 | i am ever feeling nostalgic about the fireplac... | 2 | 18 | [i, am, ever, feeling, nostalgic, about, the, ... | [ever, feeling, nostalgic, fireplace, know, st... | [ever, feel, nostalgic, fireplace, know, still... | ever feel nostalgic fireplace know still property |
| 4 | i am feeling grouchy | 3 | 4 | [i, am, feeling, grouchy] | [feeling, grouchy] | [feel, grouchy] | feel grouchy |
In [28]:
early_stopping = tf.keras.callbacks.EarlyStopping(monitor='val_accuracy', mode='auto', patience=5,
restore_best_weights=True)
epochs = 100
hist = model.fit(Padded_train, train['label'], epochs=epochs,
validation_data=(Padded_val, validation['label']),
callbacks=[early_stopping])
Epoch 1/100 500/500 ━━━━━━━━━━━━━━━━━━━━ 3s 4ms/step - accuracy: 0.3058 - loss: 1.6453 - val_accuracy: 0.3520 - val_loss: 1.5789 Epoch 2/100 500/500 ━━━━━━━━━━━━━━━━━━━━ 2s 3ms/step - accuracy: 0.3328 - loss: 1.5858 - val_accuracy: 0.3520 - val_loss: 1.5693 Epoch 3/100 500/500 ━━━━━━━━━━━━━━━━━━━━ 2s 3ms/step - accuracy: 0.3493 - loss: 1.5715 - val_accuracy: 0.3520 - val_loss: 1.5570 Epoch 4/100 500/500 ━━━━━━━━━━━━━━━━━━━━ 2s 3ms/step - accuracy: 0.3557 - loss: 1.5604 - val_accuracy: 0.3520 - val_loss: 1.5407 Epoch 5/100 500/500 ━━━━━━━━━━━━━━━━━━━━ 1s 3ms/step - accuracy: 0.3703 - loss: 1.5371 - val_accuracy: 0.3540 - val_loss: 1.5132 Epoch 6/100 500/500 ━━━━━━━━━━━━━━━━━━━━ 1s 3ms/step - accuracy: 0.4169 - loss: 1.4933 - val_accuracy: 0.3655 - val_loss: 1.4713 Epoch 7/100 500/500 ━━━━━━━━━━━━━━━━━━━━ 1s 3ms/step - accuracy: 0.4509 - loss: 1.4388 - val_accuracy: 0.4920 - val_loss: 1.4138 Epoch 8/100 500/500 ━━━━━━━━━━━━━━━━━━━━ 1s 3ms/step - accuracy: 0.4872 - loss: 1.3717 - val_accuracy: 0.6000 - val_loss: 1.3438 Epoch 9/100 500/500 ━━━━━━━━━━━━━━━━━━━━ 2s 3ms/step - accuracy: 0.5338 - loss: 1.2959 - val_accuracy: 0.5860 - val_loss: 1.2628 Epoch 10/100 500/500 ━━━━━━━━━━━━━━━━━━━━ 2s 3ms/step - accuracy: 0.5735 - loss: 1.2076 - val_accuracy: 0.6550 - val_loss: 1.1799 Epoch 11/100 500/500 ━━━━━━━━━━━━━━━━━━━━ 2s 4ms/step - accuracy: 0.6243 - loss: 1.1263 - val_accuracy: 0.6765 - val_loss: 1.1032 Epoch 12/100 500/500 ━━━━━━━━━━━━━━━━━━━━ 2s 3ms/step - accuracy: 0.6604 - loss: 1.0501 - val_accuracy: 0.7540 - val_loss: 1.0271 Epoch 13/100 500/500 ━━━━━━━━━━━━━━━━━━━━ 2s 3ms/step - accuracy: 0.6977 - loss: 0.9732 - val_accuracy: 0.7620 - val_loss: 0.9473 Epoch 14/100 500/500 ━━━━━━━━━━━━━━━━━━━━ 1s 3ms/step - accuracy: 0.7296 - loss: 0.9100 - val_accuracy: 0.7995 - val_loss: 0.8737 Epoch 15/100 500/500 ━━━━━━━━━━━━━━━━━━━━ 1s 3ms/step - accuracy: 0.7511 - loss: 0.8390 - val_accuracy: 0.8315 - val_loss: 0.8230 Epoch 16/100 500/500 ━━━━━━━━━━━━━━━━━━━━ 1s 3ms/step - accuracy: 0.7692 - loss: 0.8013 - val_accuracy: 0.8265 - val_loss: 0.7579 Epoch 17/100 500/500 ━━━━━━━━━━━━━━━━━━━━ 1s 3ms/step - accuracy: 0.7878 - loss: 0.7368 - val_accuracy: 0.8360 - val_loss: 0.7103 Epoch 18/100 500/500 ━━━━━━━━━━━━━━━━━━━━ 1s 3ms/step - accuracy: 0.7981 - loss: 0.6935 - val_accuracy: 0.8425 - val_loss: 0.6640 Epoch 19/100 500/500 ━━━━━━━━━━━━━━━━━━━━ 1s 3ms/step - accuracy: 0.8067 - loss: 0.6642 - val_accuracy: 0.8410 - val_loss: 0.6254 Epoch 20/100 500/500 ━━━━━━━━━━━━━━━━━━━━ 2s 3ms/step - accuracy: 0.8110 - loss: 0.6326 - val_accuracy: 0.8425 - val_loss: 0.5933 Epoch 21/100 500/500 ━━━━━━━━━━━━━━━━━━━━ 1s 3ms/step - accuracy: 0.8297 - loss: 0.5910 - val_accuracy: 0.8535 - val_loss: 0.5599 Epoch 22/100 500/500 ━━━━━━━━━━━━━━━━━━━━ 2s 3ms/step - accuracy: 0.8396 - loss: 0.5555 - val_accuracy: 0.8655 - val_loss: 0.5327 Epoch 23/100 500/500 ━━━━━━━━━━━━━━━━━━━━ 1s 2ms/step - accuracy: 0.8368 - loss: 0.5369 - val_accuracy: 0.8660 - val_loss: 0.5125 Epoch 24/100 500/500 ━━━━━━━━━━━━━━━━━━━━ 1s 3ms/step - accuracy: 0.8404 - loss: 0.5210 - val_accuracy: 0.8585 - val_loss: 0.5013 Epoch 25/100 500/500 ━━━━━━━━━━━━━━━━━━━━ 1s 3ms/step - accuracy: 0.8482 - loss: 0.5153 - val_accuracy: 0.8535 - val_loss: 0.4860 Epoch 26/100 500/500 ━━━━━━━━━━━━━━━━━━━━ 2s 3ms/step - accuracy: 0.8547 - loss: 0.4789 - val_accuracy: 0.8665 - val_loss: 0.4602 Epoch 27/100 500/500 ━━━━━━━━━━━━━━━━━━━━ 1s 3ms/step - accuracy: 0.8620 - loss: 0.4593 - val_accuracy: 0.8700 - val_loss: 0.4458 Epoch 28/100 500/500 ━━━━━━━━━━━━━━━━━━━━ 1s 3ms/step - accuracy: 0.8614 - loss: 0.4567 - val_accuracy: 0.8710 - val_loss: 0.4349 Epoch 29/100 500/500 ━━━━━━━━━━━━━━━━━━━━ 1s 3ms/step - accuracy: 0.8649 - loss: 0.4433 - val_accuracy: 0.8710 - val_loss: 0.4268 Epoch 30/100 500/500 ━━━━━━━━━━━━━━━━━━━━ 2s 3ms/step - accuracy: 0.8709 - loss: 0.4332 - val_accuracy: 0.8740 - val_loss: 0.4163 Epoch 31/100 500/500 ━━━━━━━━━━━━━━━━━━━━ 1s 3ms/step - accuracy: 0.8707 - loss: 0.4164 - val_accuracy: 0.8720 - val_loss: 0.4064 Epoch 32/100 500/500 ━━━━━━━━━━━━━━━━━━━━ 2s 3ms/step - accuracy: 0.8722 - loss: 0.4027 - val_accuracy: 0.8805 - val_loss: 0.4007 Epoch 33/100 500/500 ━━━━━━━━━━━━━━━━━━━━ 2s 3ms/step - accuracy: 0.8767 - loss: 0.3934 - val_accuracy: 0.8755 - val_loss: 0.3946 Epoch 34/100 500/500 ━━━━━━━━━━━━━━━━━━━━ 1s 3ms/step - accuracy: 0.8841 - loss: 0.3898 - val_accuracy: 0.8720 - val_loss: 0.3993 Epoch 35/100 500/500 ━━━━━━━━━━━━━━━━━━━━ 2s 3ms/step - accuracy: 0.8786 - loss: 0.3872 - val_accuracy: 0.8740 - val_loss: 0.3856 Epoch 36/100 500/500 ━━━━━━━━━━━━━━━━━━━━ 1s 3ms/step - accuracy: 0.8826 - loss: 0.3690 - val_accuracy: 0.8805 - val_loss: 0.3820 Epoch 37/100 500/500 ━━━━━━━━━━━━━━━━━━━━ 1s 3ms/step - accuracy: 0.8891 - loss: 0.3645 - val_accuracy: 0.8810 - val_loss: 0.3823 Epoch 38/100 500/500 ━━━━━━━━━━━━━━━━━━━━ 2s 3ms/step - accuracy: 0.8811 - loss: 0.3649 - val_accuracy: 0.8735 - val_loss: 0.3776 Epoch 39/100 500/500 ━━━━━━━━━━━━━━━━━━━━ 1s 3ms/step - accuracy: 0.8877 - loss: 0.3523 - val_accuracy: 0.8720 - val_loss: 0.3742 Epoch 40/100 500/500 ━━━━━━━━━━━━━━━━━━━━ 1s 3ms/step - accuracy: 0.8911 - loss: 0.3456 - val_accuracy: 0.8810 - val_loss: 0.3730 Epoch 41/100 500/500 ━━━━━━━━━━━━━━━━━━━━ 1s 3ms/step - accuracy: 0.8968 - loss: 0.3352 - val_accuracy: 0.8700 - val_loss: 0.3899 Epoch 42/100 500/500 ━━━━━━━━━━━━━━━━━━━━ 2s 3ms/step - accuracy: 0.8944 - loss: 0.3318 - val_accuracy: 0.8755 - val_loss: 0.3678
Train and Validation Loss Graphs¶
In [29]:
plt.figure(figsize=(15, 8))
plt.plot(hist.history['loss'], label='Train Loss')
plt.plot(hist.history['val_loss'], label='Validation Loss')
plt.title('Train and Validation Loss Graphs')
plt.xlabel('Epochs')
plt.ylabel('Loss')
plt.legend()
Out[29]:
<matplotlib.legend.Legend at 0x2981b779310>
Preparing the Test Data¶
In [30]:
test['text_tokenized'] = test['text'].apply(tokenization)
test['text_stop'] = test['text_tokenized'].apply(stopwords_remove)
test['text_lemmatized'] = test['text_stop'].apply(lemmatization)
test['text_cleaned'] = test['text_lemmatized'].str.join(' ')
Tokenized_test = tokenizer.texts_to_sequences(test['text_cleaned'])
Padded_test = pad_sequences(Tokenized_test, maxlen=maxlen, padding='pre')
test['label'] = test['label'].replace(label_)
test_evaluate = model.evaluate(Padded_test, test['label'])
63/63 ━━━━━━━━━━━━━━━━━━━━ 0s 2ms/step - accuracy: 0.8779 - loss: 0.3730
In [31]:
test.head()
Out[31]:
| text | label | text_tokenized | text_stop | text_lemmatized | text_cleaned | |
|---|---|---|---|---|---|---|
| 0 | im feeling rather rotten so im not very ambiti... | 0 | [im, feeling, rather, rotten, so, im, not, ver... | [im, feeling, rather, rotten, im, ambitious, r... | [im, feel, rather, rotten, im, ambitious, right] | im feel rather rotten im ambitious right |
| 1 | im updating my blog because i feel shitty | 0 | [im, updating, my, blog, because, i, feel, shi... | [im, updating, blog, feel, shitty] | [im, update, blog, feel, shitty] | im update blog feel shitty |
| 2 | i never make her separate from me because i do... | 0 | [i, never, make, her, separate, from, me, beca... | [never, make, separate, ever, want, feel, like... | [never, make, separate, ever, want, feel, like... | never make separate ever want feel like ashamed |
| 3 | i left with my bouquet of red and yellow tulip... | 1 | [i, left, with, my, bouquet, of, red, and, yel... | [left, bouquet, red, yellow, tulips, arm, feel... | [leave, bouquet, red, yellow, tulips, arm, fee... | leave bouquet red yellow tulips arm feel sligh... |
| 4 | i was feeling a little vain when i did this one | 0 | [i, was, feeling, a, little, vain, when, i, di... | [feeling, little, vain, one] | [feel, little, vain, one] | feel little vain one |
Making Predictions in the Test Data¶
In [32]:
def make_predictions(text_input):
text_input = str(text_input)
text_input = tokenization(text_input)
text_input = stopwords_remove(text_input)
text_input = lemmatization(text_input)
text_input = ' '.join(text_input)
text_input = tokenizer.texts_to_sequences([text_input])
text_input = pad_sequences(text_input, maxlen=maxlen, padding='pre')
text_input = np.argmax(model.predict(text_input))
if text_input == 0:
print('Predicted Emotion: Sadness')
elif text_input == 1:
print('Predicted Emotion: Joy')
elif text_input == 2:
print('Predicted Emotion: Love')
elif text_input == 3:
print('Predicted Emotion: Anger')
elif text_input == 4:
print('Predicted Emotion: Fear')
else:
print('Predicted Emotion: Surprise')
return text_input
label_ = {0: "Sadness", 1: "Joy", 2: "Love", 3: "Anger", 4: "Fear", 5: "Surprise"}
test['label'] = test['label'].replace(label_)
# Randomly chosen Test Dataset data points
i = random.randint(0, len(test) - 1)
print('Test Text:', test['text'][i])
print(' ')
print('Actual Emotion:', test['label'][i])
make_predictions(test['text'][i])
print('-'*50)
print('Test Text:', test['text'][i+1])
print(' ')
print('Actual Emotion:', test['label'][i+1])
make_predictions(test['text'][i+1])
Test Text: i am feeling ok lots of bruising to my arms where they decided to remove blood from me Actual Emotion: Joy 1/1 ━━━━━━━━━━━━━━━━━━━━ 0s 86ms/step Predicted Emotion: Joy -------------------------------------------------- Test Text: i just don t feel that the others are worthwhile Actual Emotion: Joy 1/1 ━━━━━━━━━━━━━━━━━━━━ 0s 50ms/step Predicted Emotion: Joy
Out[32]:
1
Confusion Matrix of the Test Data¶
In [33]:
from sklearn.metrics import confusion_matrix
import numpy as np
label_ = {"Sadness": 0, "Joy": 1, "Love": 2, "Anger": 3, "Fear": 4, "Surprise": 5}
test['label'] = test['label'].replace(label_)
predictions = model.predict(Padded_test)
pred = np.argmax(predictions, axis=1)
# pred = model.predict_classes(Padded_test)
plt.figure(figsize=(15, 8))
conf_mat = confusion_matrix(test['label'].values, pred)
conf_mat = pd.DataFrame(conf_mat, columns=np.unique(test['label']), index=np.unique(pred))
conf_mat.index.name = 'Actual'
conf_mat.columns.name = 'Predicted'
sns.heatmap(conf_mat, annot=True, fmt='g')
plt.title('Confusion Matrix of the Test Data', fontsize=14)
plt.show()
1/63 ━━━━━━━━━━━━━━━━━━━━ 5s 91ms/step
C:\Users\Mayank\AppData\Local\Temp\ipykernel_12756\422763479.py:5: FutureWarning:
Downcasting behavior in `replace` is deprecated and will be removed in a future version. To retain the old behavior, explicitly call `result.infer_objects(copy=False)`. To opt-in to the future behavior, set `pd.set_option('future.no_silent_downcasting', True)`
63/63 ━━━━━━━━━━━━━━━━━━━━ 0s 1ms/step
In [34]:
make_predictions("She’s flying high after the successful product launch.")
1/1 ━━━━━━━━━━━━━━━━━━━━ 0s 35ms/step Predicted Emotion: Joy
Out[34]:
1
In [35]:
make_predictions("I’m going to have the first meeting with a big client tomorrow, and I’m feeling butterflies in my stomach")
1/1 ━━━━━━━━━━━━━━━━━━━━ 0s 33ms/step Predicted Emotion: Love
Out[35]:
2
In [36]:
make_predictions("I just asked one question to confirm his request, and my boss bit my head off.")
1/1 ━━━━━━━━━━━━━━━━━━━━ 0s 42ms/step Predicted Emotion: Anger
Out[36]:
3
In [37]:
make_predictions('No one told you when to run, you missed the starting gun')
1/1 ━━━━━━━━━━━━━━━━━━━━ 0s 43ms/step Predicted Emotion: Sadness
Out[37]:
0
In [38]:
make_predictions("Sometimes the people who appear to be the most confident are actually afraid of their own shadows.")
1/1 ━━━━━━━━━━━━━━━━━━━━ 0s 66ms/step Predicted Emotion: Fear
Out[38]:
4
In [39]:
make_predictions("I'm really impressed that Ashley can speak 7 languages, whereas I only speak one!")
1/1 ━━━━━━━━━━━━━━━━━━━━ 0s 62ms/step Predicted Emotion: Surprise
Out[39]:
5
In [ ]: